Inherited parameter and data from the import tab.
filtered_data <- read.csv("../data/data-2023-09-11 (2).csv", header = TRUE)
selected_stats <- c("Ho", "Hs", "Ht", "Fis (W&C)", "Fst (W&C)", "Fis (Nei)", "Fst (Nei)")
n_rep = 100
n_marker = 6
n_pop = 8
sequence_length <- length(6:11)
library(hierfstat)
library(adegenet)
filtered_data <- data.frame(indv = paste(substr(filtered_data$Population, 1, 3),
row.names(filtered_data), sep = "."), filtered_data)
# Create mydata_genind
population <- filtered_data$Population
mydata_genind <- df2genind(X = filtered_data[, 6:11], sep = "/", ncode = 6, ind.names = filtered_data$indv,
pop = filtered_data$Population, NA.char = "0/0", ploidy = 2, type = "codom",
strata = NULL, hierarchy = NULL)
mydata_genind
mydata_hierfstat <- genind2hierfstat(mydata_genind)
# Libraries
library("poppr")
## Registered S3 method overwritten by 'pegas':
## method from
## print.amova ade4
## This is poppr version 2.9.4. To get started, type package?poppr
## OMP parallel support: available
library("heatmaply")
## Loading required package: plotly
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
## Loading required package: viridis
## Loading required package: viridisLite
## Registered S3 method overwritten by 'dendextend':
## method from
## rev.hclust vegan
## Registered S3 method overwritten by 'seriation':
## method from
## reorder.hclust vegan
##
## ======================
## Welcome to heatmaply version 1.5.0
##
## Type citation('heatmaply') for how to cite the package.
## Type ?heatmaply for the main documentation.
##
## The github page is: https://github.com/talgalili/heatmaply/
## Please submit your suggestions and bug-reports at: https://github.com/talgalili/heatmaply/issues
## You may ask questions at stackoverflow, use the r and heatmaply tags:
## https://stackoverflow.com/questions/tagged/heatmaply
## ======================
missing_data <- info_table(mydata_genind, plot = FALSE)
# Matrix format
mat <- as.matrix(missing_data)
# heatmap
p <- heatmaply(mat, dendrogram = "none", xlab = "", ylab = "", main = "", scale = "column",
margins = c(60, 100, 40, 20), grid_color = "white", grid_width = 1e-05, titleX = FALSE,
hide_colorbar = TRUE, branches_lwd = 0.1, label_names = c("Population", "Marker",
"Value"), fontsize_row = 8, fontsize_col = 8, labCol = colnames(mat), labRow = rownames(mat),
heatmap_layers = theme(axis.line = element_blank()))
p
library(pegas)
## Loading required package: ape
##
## Attaching package: 'ape'
## The following objects are masked from 'package:hierfstat':
##
## pcoa, varcomp
##
## Attaching package: 'pegas'
## The following object is masked from 'package:ape':
##
## mst
## The following object is masked from 'package:ade4':
##
## amova
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:ape':
##
## where
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tibble)
result <- basic.stats(mydata_hierfstat) #hierfstat
df_resutl_basic <- as.data.frame(result$perloc)
# Weir and Cockrham estimates of Fstatistics - FIS and FST
result_f_stats <- Fst(as.loci(mydata_genind), na.alleles = "") #pegas
result_f_stats <- result_f_stats[, 2:3]
colnames(result_f_stats) <- c("Fst (W&C)", "Fis (W&C)")
result_f_stats <- merge(result_f_stats, df_resutl_basic, by = "row.names", all.x = TRUE)
colnames(result_f_stats)[10] <- "Fst (Nei)"
colnames(result_f_stats)[12] <- "Fis (Nei)"
result_f_stats <- result_f_stats %>%
column_to_rownames(., var = "Row.names")
result_f_stats_selec <- result_f_stats %>%
select(all_of(selected_stats))
result_f_stats_selec
library(hrbrthemes)
library(ggplot2)
# compute the Gstats
result_f_stats <- result_f_stats %>%
mutate(GST = 1 - Hs/Ht)
result_f_stats <- result_f_stats %>%
mutate(`GST''` = (n_pop * (Ht - Hs))/((n_pop * Hs - Ht) * (1 - Hs)))
result_f_stats
# Plot with linear trend
p2 <- ggplot(result_f_stats, aes(x = GST, y = Hs)) + geom_point() + geom_smooth(method = lm,
color = "red", se = FALSE) + theme_ipsum()
p2
## `geom_smooth()` using formula = 'y ~ x'
library("pegas")
hw.test(as.loci(mydata_genind), B = n_rep)
## chi^2 df Pr(chi^2 >) Pr.exact
## B12 693.49866 15 0.0000000 0.00
## C07 27.78220 28 0.4760334 0.09
## D12 799.33270 66 0.0000000 0.00
## D10 492.53039 28 0.0000000 0.00
## A12 11.89948 10 0.2918403 0.26
## C03 56.70154 36 0.0153672 0.10
library(radiator)
# https://thierrygosselin.github.io/radiator/reference/genomic_converter.html
# https://thierrygosselin.github.io/radiator/articles/rad_genomics_computer_setup.html
# mydata1 <- genomic_converter(mydata_genind, parallel.core =
# parallel::detectCores() - 1, output = 'genepop', filename =
# 'mydata.genepop.txt')
library(genepop)
# https://cran.r-project.org/web/packages/genepop/genepop.pdf genepop_HW <-
# test_HW( inputFile =
# '05_radiator_genomic_converter_20231017@1623/mydata.genepop.txt', which =
# 'Proba', outputFile = '', settingsFile = '', enumeration = FALSE,
# dememorization = 10000, batches = 20, iterations = 5000, verbose =
# interactive() )
I have found different values from Fstats.
The index of association was originally developed by A.H.D. Brown analyzing population structure of wild barley (Brown, 1980).
Ia: The index of association. p.Ia: The p-value resulting from a one-sided permutation test based on the number of samples indicated in the original call. rbard: The standardized index of association. p.rd: The p-value resulting from a one-sided permutation test based on the number of samples indicated in the original call.
pair.ia calculates the index of association in a pairwise manner among all loci.
loci_pair <- pair.ia(mydata_genind, sample = n_rep, quiet = FALSE, method = 1,plot = FALSE)
loci_pair
## Ia p.Ia rbarD p.rD
## B12:C07 -0.0039 0.9901 -0.0039 0.9901
## B12:D12 0.0545 0.8317 0.0547 0.8317
## B12:D10 0.0204 0.2079 0.0204 0.2079
## B12:A12 0.0356 0.0099 0.0356 0.0099
## B12:C03 0.0531 0.1683 0.0534 0.1782
## C07:D12 -0.0233 0.8317 -0.0233 0.8317
## C07:D10 0.0172 0.4059 0.0172 0.4059
## C07:A12 0.0364 0.5941 0.0365 0.5941
## C07:C03 0.1252 0.0693 0.1254 0.0693
## D12:D10 0.0708 0.0099 0.0709 0.0198
## D12:A12 0.0244 0.1584 0.0245 0.1584
## D12:C03 -0.0135 0.6139 -0.0135 0.6139
## D10:A12 0.0170 0.3465 0.0170 0.3465
## D10:C03 0.0456 0.1089 0.0458 0.1089
## A12:C03 0.0284 0.0990 0.0287 0.0990
“Genotypes at one locus are independent from genotypes at the other locus”. For a pair of diploid loci, no assumption is made about the gametic phase in double heterozygotes. In particular, it is not inferred assuming one-locus HW equilibrium, as such equilibrium is not assumed anywhere in the formulation of the test. The test is thus one of association between diploid genotypes at both loci, sometimes described as a test of the composite linkage disequilibrium (Bruce S. Weir 1996, 126–28). Contingency tables are created for all pairs of loci in each sample, then a G test or a probability test for each table is computed for each table using the Markov chain algorithm of Raymond and Rousset (1995a). The number of switches of the algorithm is given for each table analyzed.
library(genepop)
outfile <- test_LD(inputFile = "05_radiator_genomic_converter_20231017@1623/mydata.genepop.txt",
outputFile = "", settingsFile = "", dememorization = 10000, batches = 100, iterations = n_rep,
verbose = interactive())
readLines(outfile)[136:155]
## [1] "P-value for each locus pair across all populations"
## [2] "(Fisher's method)"
## [3] "-----------------------------------------------------"
## [4] "Locus pair Chi2 df P-Value"
## [5] "-------------------- -------- --- --------"
## [6] "A12 & B12 9.909120 16 0.871331"
## [7] "A12 & C03 21.451489 16 0.161801"
## [8] "B12 & C03 21.526163 16 0.159161"
## [9] "A12 & C07 8.073544 16 0.946647"
## [10] "B12 & C07 15.275568 16 0.504556"
## [11] "C03 & C07 19.524641 16 0.242397"
## [12] "A12 & D10 13.526471 16 0.633945"
## [13] "B12 & D10 8.497647 16 0.932653"
## [14] "C03 & D10 17.076849 16 0.380641"
## [15] "C07 & D10 12.299711 16 0.723103"
## [16] "A12 & D12 12.043688 16 0.740967"
## [17] "B12 & D12 12.438999 16 0.713248"
## [18] "C03 & D12 6.951916 16 0.974177"
## [19] "C07 & D12 28.672296 16 0.026242"
## [20] "D10 & D12 14.840108 16 0.536379"
LD2 is based on the observed frequencies of different genotypes (Schaid 2004).
mat_alleles <- filtered_data %>%
select(Population)
mat_alleles <- cbind(mat_alleles, filtered_data[, 6:11])
mat_alleles_loci <- alleles2loci(mat_alleles, ploidy = 1, population = 1, phased = FALSE)
linkage_pegas <- LD2(mat_alleles_loci)
print(linkage_pegas$T2)
## T2 df P-val
## 69.43451606 48.00000000 0.02313167
DEVELOPMENT
not ready for deployment
#shuffled_matrices <- replicate(n_rep, mat[sample(nrow(mat)), ], simplify = FALSE)
shuffled_matrices <- replicate(n_rep, mat[sample(length(mat), replace = FALSE)], simplify = FALSE)
##################
# shuffle only the genotype and add the pop column later for each matrices.
#in the loop?
###############
# Create a list to store the wc
fst_df <- numeric(sequence_length)
fis_df <- numeric(sequence_length)
# Calculate the average for each shuffled matrix
# Iterate through the shuffled matrices
for (i in 1:n_rep) {
# Calculate the statistics for the i-th matrix
#HERE THE COLUMN POP
merged_df <- cbind(level_pop, shuffled_matrices[[i]])
result_f_stats <- wc(shuffled_matrices[[i]])
result_f_stats <- as.data.frame(result_f_stats$per.loc)
# Extract FST and FIS values
fst_values <- result_f_stats$FST
fis_values <- result_f_stats$FIS
print( fst_values)
# Assign values to the data frames
fst_df <- cbind(fst_df, result_f_stats$FST)
fis_df <- cbind(fis_df, result_f_stats$FIS)
}
# Set row names as in result_f_stats
rownames(fst_df) <- rownames(fis_df) <- rownames(result_f_stats)
result_FST <- fst_df[, -1]
fis_df <-fis_df[, -1]
vec <- seq(1, n_rep)
colnames(result_FST) <- colnames(fis_df) <- vec
result_FST[1,]
count (result_f_stats[,1][1] > result_FST[1,] )
count <- sum(result_f_stats[,1][1] > result_FST[1, ])
# Initialize an empty data frame to store the counts
count_df <- data.frame(
Greater = numeric(length(result_FST)),
Smaller = numeric(length(result_FST))
)
# Compare the values in result_f_stats[1] to result_FST for each column
for (col in colnames(result_FST)) {
greater_count <- sum(result_f_stats[1] > result_FST[, col])
smaller_count <- sum(result_f_stats[1] < result_FST[, col])
count_df$Greater[col] <- greater_count
count_df$Smaller[col] <- smaller_count
}
# Print the count data frame
print(count_df)
######################## ########################
library(radiator)
genomic_converter(
data,
strata = NULL,
output = NULL,
filename = NULL,
parallel.core = parallel::detectCores() - 1,
verbose = TRUE,
...
)